from datasets import load_dataset
from transformers import AutoTokenizer

# Load the raw title/content dataset and the rbt3 tokenizer from local paths.
dataset = load_dataset(r"D:\datasets\new-title-chinese")
tokenizer = AutoTokenizer.from_pretrained(r"D:\models\rbt3")
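
# Quick tokenizer check (the sample string is illustrative, not from the
# dataset): rbt3 ships a BERT-style tokenizer, so this should return
# input_ids, token_type_ids and attention_mask.
print(tokenizer("今天天气不错"))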

def preprocess_fun(example):
    # Tokenize the article body as the model input and the title as the
    # generation target; both are truncated to fixed maximum lengths.
    model_input = tokenizer(example['content'], max_length=512, truncation=True)
    labels = tokenizer(example['title'], max_length=32, truncation=True)
    # Seq2seq-style training expects the target token ids under 'labels'.
    model_input['labels'] = labels['input_ids']
    return model_input
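
# Quick check on a tiny slice (a sketch: with batched=True, map passes a dict
# of lists, which the dataset['train'][:2] slice below mimics).
print(preprocess_fun(dataset['train'][:2]).keys())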

# Map over the whole dataset in batches and drop the original text columns,
# leaving only the tokenized fields.
pro_data = dataset.map(preprocess_fun, batched=True,
                       remove_columns=dataset['train'].column_names)
# print(pro_data)

# Persist the tokenized dataset so later runs can skip preprocessing.
pro_data.save_to_disk(r"D:\codes\sshcode\HuggingFace\save_dataset")
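
# Round-trip check: reload the saved dataset with load_from_disk (a minimal
# sketch, assuming the same path used above).
from datasets import load_from_disk
reloaded = load_from_disk(r"D:\codes\sshcode\HuggingFace\save_dataset")
print(reloaded)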
